Manifold Learning Experiments

1) Loading Data

import numpy as np
import pandas as pd
import seaborn as sns
import deciphering_enigma
import matplotlib.pyplot as plt

# Path to the YAML file describing this experiment.
path_to_config = './config.yaml'

# Parse the experiment config; `exp_config` is reused by every later step
# (experiment setup, metadata extraction, loading, feature extraction, CSV paths).
exp_config = deciphering_enigma.load_yaml_config(path_to_config)
dataset_path = exp_config.dataset_path

# Register the experiment directory and collect the dataset's wav file paths,
# then keep only the files whose name ends with 'mic1_normloud.wav'.
audio_files = deciphering_enigma.build_experiment(exp_config)
audio_files = [s for s in audio_files if s.endswith('mic1_normloud.wav')]
print(f'Dataset has {len(audio_files)} samples')
Dataset has 44455 samples
# Derive per-file metadata (and the audio format) from the file-name convention,
# then discard the 'xx' and 'Label' columns, which are not used further below.
metadata_df, audio_format = deciphering_enigma.extract_metadata(exp_config, audio_files)
metadata_df.drop(columns=['xx', 'Label'], inplace=True)

# Load the audio files as tensors ready for feature extraction; the 'ID' metadata
# column is passed along as the speaker ids.
audio_tensor_list = deciphering_enigma.load_dataset(audio_files, cfg=exp_config, speaker_ids=metadata_df['ID'], audio_format=audio_format)
Audio Tensors are already saved for vctk_umap_experiment
import soundfile as sf
from tqdm import tqdm

# Duration of every audio file in seconds (frames / sample rate), in the same
# order as `audio_files`. `sf.info` only inspects the file header, so the
# waveform itself is never decoded just to measure its length.
dur = [sf.info(file).duration for file in tqdm(audio_files)]
100%|████████████████████████████████████| 44455/44455 [05:10<00:00, 143.28it/s]

2) Generating Embeddings

# Generate speech embeddings from the loaded audio tensors. The result is later
# iterated by key, one entry per model name (see the tuning loop further down).
feature_extractor = deciphering_enigma.FeatureExtractor()
embeddings_dict = feature_extractor.extract(audio_tensor_list, exp_config)
Load TERA Model
TERA embeddings are already saved for vctk_umap_experiment
(44455, 768)
import matplotlib
from pylab import cm
import matplotlib as mpl
from matplotlib import font_manager

# Rebuild Matplotlib's font cache before selecting the font family (presumably so
# that 'Times New Roman' is picked up if it was installed after the cache was
# created). `_rebuild` is a private helper that is missing from newer Matplotlib
# releases, so it is only called when it actually exists instead of aborting the
# cell with an AttributeError.
_rebuild_font_cache = getattr(font_manager, '_rebuild', None)
if callable(_rebuild_font_cache):
    _rebuild_font_cache()

# Global plot styling used by the figures below.
mpl.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['axes.linewidth'] = 3

3) Unsupervised Dimensionality Reduction

import os
import numpy as np
import pandas as pd

import scipy
from scipy.spatial.distance import pdist

from umap import UMAP
from pacmap import PaCMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

from deciphering_enigma.settings import _hyperparams_grid_reducers, _optimize_function, _knn, _subsetsize

class ReducerTuner():
    """Tuner for dimensionality reduction methods.

    Implements grid-search across hyperparameters for each dimensionality reduction method preset in the settings script.
    NOTE: any method added in the settings script should follow sklearn implementation.
    Tunes reduced dimensions by optimizing local and global structure metrics.
    Saves tuned results for each method as a pandas dataframe.
    """

    def __init__(self):
        # All tuning knobs come from the package settings module.
        self.reducer_params_grid = _hyperparams_grid_reducers
        # Stored for completeness; none of the methods below read it.
        self.optimize_func = _optimize_function
        self.knn = _knn
        self.subsetsize = _subsetsize

    @staticmethod
    def _results_path(dataset_name, model_name):
        """Return the CSV path where the tuned 3-D embeddings of a model are stored."""
        return f'../{dataset_name}/{model_name}/dim_reduction_3d.csv'

    def embedding_quality(self, X, Z, knn=10, subsetsize=1000):
        """Score how well the reduced space ``Z`` preserves the structure of ``X``.

        Args:
            X: 2-D array of the original (high-dimensional) points, one row per sample.
            Z: 2-D array of the reduced points, row-aligned with ``X``.
            knn: Number of neighbours compared per point for the local score.
            subsetsize: Number of points sampled for the global score; capped at
                the number of rows of ``X``.

        Returns:
            Tuple ``(mnn, rho)``:
            ``mnn`` is the average fraction of each point's ``knn`` nearest
            neighbours in ``X`` that are also among its ``knn`` nearest neighbours
            in ``Z`` (local structure);
            ``rho`` is the Spearman correlation between the pairwise distances
            (``pdist`` default metric) in ``X`` and in ``Z`` over a random subset
            of points (global structure).

        Note:
            The subset is drawn from NumPy's global random state, so ``rho`` varies
            between calls unless the caller seeds ``np.random``.
        """
        # Imported explicitly: a bare `import scipy` does not guarantee that the
        # `scipy.stats` submodule has been loaded.
        from scipy.stats import spearmanr

        n_samples = X.shape[0]

        # kneighbors() without a query returns the neighbours of the fitted points.
        ind_high = NearestNeighbors(n_neighbors=knn).fit(X).kneighbors(return_distance=False)
        ind_low = NearestNeighbors(n_neighbors=knn).fit(Z).kneighbors(return_distance=False)
        shared = sum(len(set(high) & set(low)) for high, low in zip(ind_high, ind_low))
        mnn = shared / n_samples / knn

        # Sample WITHOUT replacement: a point drawn twice would contribute
        # zero-distance pairs to both distance vectors, i.e. artificial ties that
        # distort the rank correlation.
        subset = np.random.choice(n_samples, size=min(subsetsize, n_samples), replace=False)
        d1 = pdist(X[subset, :])
        d2 = pdist(Z[subset, :])
        rho = spearmanr(d1, d2).correlation
        return (mnn, rho)

    def get_reducer(self, name):
        """Map a reducer name from the settings grid to its class.

        Args:
            name: One of 'PCA', 'tSNE', 'UMAP' or 'PaCMAP'.

        Returns:
            The (uninstantiated) reducer class.

        Raises:
            AttributeError: If ``name`` is not one of the supported reducers.
        """
        reducers = {'PCA': PCA, 'tSNE': TSNE, 'UMAP': UMAP, 'PaCMAP': PaCMAP}
        try:
            return reducers[name]
        except KeyError:
            raise AttributeError(f'This reducer {name} is not included...') from None

    def fit_eval(self, embeddings, reducer):
        """Standardize ``embeddings``, reduce them and score the result.

        Args:
            embeddings: 2-D array of embeddings, one row per sample.
            reducer: Instantiated reducer exposing ``fit_transform``.

        Returns:
            Tuple ``(reduced_embeddings, local_val, global_val)`` where the two
            scores come from :meth:`embedding_quality`, computed against the
            standardized embeddings.
        """
        stand_embeddings = StandardScaler().fit_transform(embeddings)
        reduced_embeddings = reducer.fit_transform(stand_embeddings)
        local_val, global_val = self.embedding_quality(stand_embeddings, reduced_embeddings, knn=self.knn, subsetsize=self.subsetsize)
        return reduced_embeddings, local_val, global_val

    def save_results_pandas(self, reducers_embeddings_dict, metadata=None, model_name=None, dataset_name=None):
        """Write the tuned embeddings (plus optional metadata) to the results CSV.

        The CSV has a three-level column header ``(Method, Optimized Metric, Dim)``;
        metadata columns are appended with empty second and third levels.

        Args:
            reducers_embeddings_dict: ``{reducer_name: {'Local': array, 'Global': array}}``;
                the first three columns of each array are saved as Dim1..Dim3.
            metadata: Optional DataFrame with one row per embedded sample, in the
                same row order as the embeddings. When given, its index is used
                for the saved frame so rows are paired by position.
            model_name: Model name used in the output path.
            dataset_name: Dataset name used in the output path.

        Raises:
            ValueError: If ``metadata`` does not have one row per embedded sample.
        """
        save_path = self._results_path(dataset_name, model_name)
        combined_column_obj = pd.MultiIndex.from_product(
            [list(reducers_embeddings_dict.keys()), ['Local', 'Global'], ['Dim1', 'Dim2', 'Dim3']],
            names=["Method", "Optimized Metric", "Dim"],
        )

        # Lay the arrays out in the same order as the column index above and build
        # the frame in one go, rather than assigning columns into a zero-row frame.
        blocks = [
            np.asarray(variants[metric])[:, :3]
            for variants in reducers_embeddings_dict.values()
            for metric in ('Local', 'Global')
        ]
        data = np.hstack(blocks) if blocks else None

        index = None
        if metadata is not None:
            if data is not None and len(metadata) != data.shape[0]:
                raise ValueError(
                    f'metadata has {len(metadata)} rows but the embeddings have {data.shape[0]}'
                )
            index = metadata.index
        df = pd.DataFrame(data=data, index=index, columns=combined_column_obj)

        if metadata is not None and len(metadata.columns) > 0:
            temp_df = metadata.copy()
            temp_df.columns = pd.MultiIndex.from_tuples([(column, '', '') for column in temp_df.columns])
            df = pd.concat([df, temp_df], axis=1)

        # Make sure the target folder exists so a fresh experiment does not fail
        # at the very last step after the whole grid search has run.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df.to_csv(save_path)

    def tune_reducer(self, embeddings, metadata=None, dataset_name=None, model_name=None, save_results=True, save_path='./'):
        """Grid-search every configured reducer and keep the best embeddings.

        For each reducer in the settings grid, every hyperparameter combination is
        fitted with ``n_components=3`` and ``random_state=42``; the embedding with
        the highest local score and the one with the highest global score are kept.
        Nothing is computed if the results CSV for this model already exists.

        Args:
            embeddings: 2-D array of embeddings, one row per sample.
            metadata: Optional DataFrame saved alongside the embeddings.
            dataset_name: Dataset name used in the results path.
            model_name: Model name used in the results path and log messages.
            save_results: Whether to write the results CSV.
            save_path: Currently unused; results are always written to
                ``../{dataset_name}/{model_name}/dim_reduction_3d.csv``.
        """
        df_path = self._results_path(dataset_name, model_name)
        if os.path.isfile(df_path):
            print(f'Tuned Reduced Embeddings already saved for {model_name} model!')
            return

        reducers_embeddings_dict = {}
        n_reducers = len(self.reducer_params_grid)
        for position, (reducer_name, reducer_params) in enumerate(self.reducer_params_grid.items(), start=1):
            print(f'Reducer {position}/{n_reducers}: {reducer_name}...')
            reducer_object = self.get_reducer(reducer_name)
            all_embeddings = []
            local_metrics = []
            global_metrics = []
            for params in ParameterGrid(reducer_params):
                print(params)
                reducer = reducer_object(n_components=3, random_state=42, **params)
                reduced_embeddings, local_metric, global_metric = self.fit_eval(embeddings, reducer)
                all_embeddings.append(reduced_embeddings)
                local_metrics.append(local_metric)
                global_metrics.append(global_metric)
            # Keep one winner per optimized metric.
            reducers_embeddings_dict[reducer_name] = {
                'Local': all_embeddings[np.argmax(local_metrics)],
                'Global': all_embeddings[np.argmax(global_metrics)],
            }

        if save_results:
            self.save_results_pandas(reducers_embeddings_dict, metadata, model_name, dataset_name)
# Tune the reducers for every model's embeddings using the tuner shipped with the
# deciphering_enigma package.
tuner = deciphering_enigma.ReducerTuner()
for model_name, model_embeddings in embeddings_dict.items():
    print(f'{model_name}:')
    tuner.tune_reducer(
        model_embeddings,
        metadata=metadata_df,
        dataset_name=exp_config.dataset_name,
        model_name=model_name,
    )
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [7], in <cell line: 2>()
      1 tuner = deciphering_enigma.ReducerTuner()
----> 2 for i, model_name in enumerate(embeddings_dict.keys()):
      3     print(f'{model_name}:')
      4     tuner.tune_reducer(embeddings_dict[model_name], metadata=metadata_df, dataset_name=exp_config.dataset_name, model_name=model_name)

NameError: name 'embeddings_dict' is not defined
import plotly.express as px
import plotly.offline as py
from plotly.subplots import make_subplots
def visualize_embeddings(df, label_name, metrics=None, axis=None, acoustic_param=None, opt_structure='Local', red_name='PCA', row=1, col=1, hovertext='', label=''):
    """Add a 2-D scatter of reduced embeddings to one cell of a plotly figure.

    Args:
        df: DataFrame with three-level columns ``(method, optimized metric, dim)``
            plus the ``label_name`` and ``'AudioNames'`` columns.
        label_name: Column used to colour the points.
        metrics: Unused; accepted for call compatibility.
        axis: Target plotly figure (e.g. from ``make_subplots``); the scatter
            traces are added to it in place. Required.
        acoustic_param: Unused; accepted for call compatibility.
        opt_structure: Which tuned variant to plot, 'Local' or 'Global'.
        red_name: Name of the reduction method to plot (first column level).
        row: Row of the target subplot cell.
        col: Column of the target subplot cell.
        hovertext: Unused; accepted for call compatibility.
        label: Unused; accepted for call compatibility.

    Raises:
        ValueError: If no target figure is passed via ``axis``.
    """
    # Fail early with a clear message instead of an AttributeError raised only
    # after the scatter has already been built.
    if axis is None or not hasattr(axis, 'add_traces'):
        raise ValueError('axis must be a plotly figure to add the scatter traces to')

    scatter = px.scatter(
        x=df[red_name, opt_structure, 'Dim1'],
        y=df[red_name, opt_structure, 'Dim2'],
        color=df[label_name],
        hover_name=df['AudioNames'],
    )
    # Only the traces are copied over; the temporary figure's layout is discarded.
    axis.add_traces(list(scatter.select_traces()), rows=row, cols=col)
optimize = 'Global'
label = 'ID'

# Column labels that pandas fills in as 'Unnamed: ...' when the CSV's three-level
# header is read back; they are blanked so those columns (presumably the metadata
# ones such as 'ID' and 'AudioNames') can be selected by their top-level name.
_UNNAMED_2D_LEVELS = {'Unnamed: 5_level_1': '', 'Unnamed: 5_level_2': '', 'Unnamed: 6_level_1': '', 'Unnamed: 6_level_2': ''}

# Layout of the cells that use a 2x4 subplot grid on a large canvas.
_WIDE_LAYOUT = {'rows': 2, 'cols': 4, 'width': 4000, 'height': 2000}


def _load_reduced_2d(model_name):
    """Read the tuned 2-D embeddings CSV of ``model_name`` and tidy its header."""
    frame = pd.read_csv(f'../{exp_config.dataset_name}/{model_name}/dim_reduction.csv', header=[0,1,2])
    frame.rename(columns=_UNNAMED_2D_LEVELS, inplace=True)
    return frame


def _show_umap(frame, label_name, rows=1, cols=1, width=1000, height=800):
    """Plot the UMAP embedding of ``frame`` coloured by ``label_name`` and show it.

    The scatter is drawn in cell (1, 1) of a ``rows`` x ``cols`` subplot grid.
    Returns the displayed figure.
    """
    figure = make_subplots(rows=rows, cols=cols)
    visualize_embeddings(frame, label_name, metrics=[], axis=figure, opt_structure=optimize, red_name='UMAP')
    figure.update_layout(autosize=False, width=width, height=height, showlegend=False)
    figure.show()
    return figure


# One figure per model, coloured by speaker ID.
for model_name in ('Log-Mel-Spectrogram', 'BYOL-A_default', 'BYOL-I_default', 'BYOL-S_default',
                   'BYOL-S_cvt', 'Hybrid_BYOL-S_cvt', 'APC', 'TERA'):
    df = _load_reduced_2d(model_name)
    fig = _show_umap(df, label)

for model_name in ('Wav2Vec2_latent', 'Wav2Vec2'):
    df = _load_reduced_2d(model_name)
    fig = _show_umap(df, label, **_WIDE_LAYOUT)

for model_name in ('HuBERT_latent', 'HuBERT_best', 'HuBERT'):
    df = _load_reduced_2d(model_name)
    fig = _show_umap(df, label)

# HuBERT again, coloured by the second '_'-separated token of the audio name.
model_name = 'HuBERT'
df = _load_reduced_2d(model_name)
df['Scentence'] = df['AudioNames'].apply(lambda x: x.split('_')[1])
fig = _show_umap(df, 'Scentence')

# HuBERT again, coloured by log-duration (uses `dur` computed while loading data).
model_name = 'HuBERT'
df = _load_reduced_2d(model_name)
df['Duration'] = np.log(dur)
df['Duration'] = df['Duration'].astype(float)
fig = _show_umap(df, 'Duration')

model_name = 'Data2Vec_latent'
df = _load_reduced_2d(model_name)
fig = _show_umap(df, label)

model_name = 'Data2Vec'
df = _load_reduced_2d(model_name)
fig = _show_umap(df, label, **_WIDE_LAYOUT)
import plotly.express as px
import plotly.offline as py
from plotly.subplots import make_subplots
def visualize_3d_embeddings(df, label_name, metrics=None, axis=None, acoustic_param=None, opt_structure='Local', red_name='PCA', row=1, col=1, hovertext='', label=''):
    """Build a 3-D scatter of reduced embeddings and return it.

    Args:
        df: DataFrame with three-level columns ``(method, optimized metric, dim)``
            plus the ``label_name`` and ``'AudioNames'`` columns.
        label_name: Column used to colour the points.
        metrics: Unused; accepted for call compatibility.
        axis: Unused; accepted for call compatibility.
        acoustic_param: Unused; accepted for call compatibility.
        opt_structure: Which tuned variant to plot, 'Local' or 'Global'.
        red_name: Name of the reduction method to plot (first column level).
        row: Unused; accepted for call compatibility.
        col: Unused; accepted for call compatibility.
        hovertext: Unused; accepted for call compatibility.
        label: Unused; accepted for call compatibility.

    Returns:
        The plotly figure holding the 3-D scatter, with its legend hidden.
    """
    figure = px.scatter_3d(
        x=df[red_name, opt_structure, 'Dim1'],
        y=df[red_name, opt_structure, 'Dim2'],
        z=df[red_name, opt_structure, 'Dim3'],
        color=df[label_name],
        hover_name=df['AudioNames'],
    )
    figure.layout.update(showlegend=False)
    # Hand the figure back to the caller; without this the scatter was built and
    # then thrown away, so the function had no observable result.
    return figure
# Interactive 3-D view of the globally-optimized UMAP embedding for TERA,
# coloured by the column named in `label`.
model_name = 'TERA'
csv_path = f'../{exp_config.dataset_name}/{model_name}/dim_reduction_3d.csv'
df = pd.read_csv(csv_path, header=[0, 1, 2])
df.rename(
    columns={
        'Unnamed: 7_level_1': '',
        'Unnamed: 7_level_2': '',
        'Unnamed: 8_level_1': '',
        'Unnamed: 8_level_2': '',
    },
    inplace=True,
)

# Map the three saved dimensions onto the x / y / z plot axes.
coords = {
    axis_key: df['UMAP', 'Global', dim]
    for axis_key, dim in zip('xyz', ('Dim1', 'Dim2', 'Dim3'))
}
fig = px.scatter_3d(color=df[label], hover_name=df['AudioNames'], **coords)

fig.update_layout(autosize=False, width=1000, height=1000, showlegend=False)
fig.show()